cd "C:\Users\remij\Desktop\Replication Files JOEG\Stata"
set more off

*************************************
*** MAIN CROSS-SECTIONAL DATA SET ***
*************************************

* We start with "grid_africa_01182018.dta", which is created in the subfolder "Grid Africa"
* The data set is created based on the main Africa-wide data set used by Jedwab & Moradi 2016. Their replication files include their main data set for Africa "africa2".
* Jedwab, Remi and Alexander Moradi, “The Permanent Effects of Transportation Revolutions in Poor Countries: Evidence from Africa,” Review of Economics and Statistics, 2016, 98 (2), 268–284.
use "Grid Africa\grid_africa_01182018.dta", clear
count
* 203,823 cells
codebook country
* 43
tab country
* Includes some countries we won't use in the end: South Africa, Lesotho and Swaziland, and Madagascar

***** Longitude, latitude *****
gen longitude = centroidx
gen latitude = centroidy
codebook longitude latitude
drop centroid*

***** Beach and Roome mission dummies *****
gen beach03_pro_yn = missioncr_dummy
gen roome24_mis_yn = missionnunn_dummy
drop *dummy

***** Controls: Area and rainfall *****

** Area **
* Grid's land area in sq km
gen area_sqkm = grid_land
drop if area_sqkm == 0
count
* 203,574
* We drop the cells with only water. 

** Rainfall **
gen longitude_rain = round(longitude,0.5)
gen latitude_rain = round(latitude,0.5)
* We add the rainfall data. 
* From the dta "rainfall_africa" which we had created for Jedwab & Moradi 2016.
sort longitude_rain latitude_rain
merge longitude_rain latitude_rain using rainfall_africa
tab _m
drop if _m == 2
save grid_africa_final.dta, replace
* 4000 observations or so with missing rainfall data (likely due to the GIS files used and coastal cells only).

use grid_africa_final, clear
keep if dist2coast <= 20 & prec_mean != . 
keep longitude latitude prec_mean gridcell
count
* About 3500 observations are within 20 km from the coast. 
ren longitude longitude_coast
ren latitude latitude_coast
save grid_africa_rain, replace
* For these, we use the value of the closest cell with a non-missing value. 

* To do so, we calculate the distance to each other cell. 
use grid_africa_final.dta, clear
keep if _m == 1
drop _m
count
keep gridcell latitude longitude
cross using grid_africa_rain
geodist latitude longitude latitude_coast longitude_coast, gen(dist)
* We keep the closest cell with a non-missing value. 
sort gridcell dist 
bysort gridcell: keep if _n == 1
keep gridcell prec_mean
ren prec_mean prec_mean_mis
sort gridcell 
save missing_rain, replace

* We add to the main data set. 
use grid_africa_final.dta, clear
drop _m
sort gridcell
merge gridcell using missing_rain
tab _m
drop _m
codebook prec_mean
* We replace the missing values by the value of the closest cell with a non-missing value.
replace prec_mean = prec_mean_mis if prec_mean == .
codebook prec_mean
drop prec_mean_mis
save grid_africa_final.dta, replace
count
* 203,574

** Distance to the Beach and Rooome stations **

** We do Beach first **
use grid_africa_final, clear
keep if beach03_pro_yn == 1
keep country longitude latitude
ren longitude longitude_beach 
ren latitude latitude_beach
count
sort country
save beach_coord, replace
* We obtain the distance. 
use grid_africa_final, clear
drop if beach03_pro_yn == 1
keep country gridcell longitude latitude
count
sort country
merge m:m country using beach_coord
tab _m
keep if _m == 3
drop _m
geodist latitude longitude latitude_beach longitude_beach, gen(dist)
collapse (min) dist, by(gridcell)
ren dist dist2beach 
sort gridcell 
save dist2beach, replace
* We add to the main data set. 
use grid_africa_final, clear
sort gridcell
merge gridcell using dist2beach
tab _m
drop _m
* We create distance dummies. 
foreach X in 15 20 50 {
gen beach03_pro_yn_`X' = (beach03_pro_yn == 1 | (dist2beach <= `X' & dist2beach != .))
tab beach03_pro_yn beach03_pro_yn_`X'
}
save grid_africa_final.dta, replace

** We do Roome next **
use grid_africa_final, clear
keep if roome24_mis_yn == 1
keep country longitude latitude
ren longitude longitude_beach 
ren latitude latitude_beach
count
sort country
save roome_coord, replace
codebook longitude latitude
* We obtain the distance. 
use grid_africa_final, clear
drop if roome24_mis_yn == 1
keep country gridcell longitude latitude
count
sort country
merge m:m country using roome_coord
tab _m
keep if _m == 3
drop _m
geodist latitude longitude latitude_beach longitude_beach, gen(dist)
collapse (min) dist, by(gridcell)
ren dist dist2roome
sort gridcell 
save dist2roome, replace
* We add to the main data set. 
use grid_africa_final, clear
sort gridcell
merge gridcell using dist2roome
tab _m
drop _m
* We create distance dummies. 
foreach X in 15 20 50 {
gen roome24_mis_yn_`X' = (roome24_mis_yn == 1 | (dist2roome <= `X' & dist2roome != .))
tab roome24_mis_yn roome24_mis_yn_`X'
}
save grid_africa_final.dta, replace

***** Night lights *****

* Created in the folder "Night lights"
use "Night lights\nlights_wotopcoding_byAfricangrid.dta", clear
keep descriptio *mean
ren descriptio gridcell
rename *_mean *
* In the regressions, we use c. 2010
* We thus keep these.
keep gridcell lli10*
sort gridcell
save nlights_wotopcoding_byAfricangrid2, replace

* We combine with the main data set *
use grid_africa_final, clear
sort gridcell
merge gridcell using nlights_wotopcoding_byAfricangrid2
tab _m
drop if _m == 2
drop _m
foreach X in 10 {
ren lli`X' nltc20`X'
}
foreach X in 1011 {
ren lli`X' nltc2011
}
* We create the log of it. 
gen lnltc2010 = log(nltc2010+1)
gen lnltc2011 = log(nltc2011+1)
label var nltc2010 "NTL (not top coded) 2010"
label var nltc2011 "NTL (not top coded) 2010-2011"
label var lnltc2010 "Log NTL (not top coded) 2010"
label var lnltc2011 "Log NTL (not top coded) 2010-2011"
sort gridcell
save grid_africa_final.dta, replace

***** We create more variables that we need for the analysis *****

use grid_africa_final, clear

** More variables related to the missions **
gen roome24_pro_yn = (protest_nunn >= 1)

** Railroads **
* Placebo lines 1916-1922.
egen dist2proposed191622 = rmin(dist2proposed1916nevbuilt dist2proposed1922nevbuilt)
* Military/mining lines.  
egen dist2rail1900_milmin = rmin(dist2rail1900_military dist2rail1900_mining)
egen dist2rail1924_milmin = rmin(dist2rail1924_military dist2rail1924_mining)
* Distance variables.
foreach X of varlist dist2rail1900 dist2rail1924 dist2rail1900_milmin dist2rail1924_milmin dist2emst dist2proposed191622 {
replace `X' = 5 if `X' <= 5
gen `X'30 = (`X' <= 30)
gen l`X'30 = log(`X'30)
}
* Placebo but cell not built because of another line. 
gen dist2proposed19162230_v2_00 = (dist2proposed19162230 == 1 & dist2rail190030 == 0)
gen dist2proposed19162230_v2_24 = (dist2proposed19162230 == 1 & dist2rail192430 == 0)

** Log of some of the distance variables **
foreach X in dist2coast dist2slaveport dist2river_nav dist2lake dist2explorer dist2rail1900 dist2rail1924 dist2nationalcity dist2muslimcentre dist2muslimbelt {
replace `X' = 5 if `X' <= 5
gen `X'10 = (`X' <= 10)
gen l`X' = log(`X')
}

** Initial population density 1800 **
gen density1800m = mpop1800/area_sqkm

** Dummy if city above 10K c. 1900 **
desc pop1900_10
gen citypop1900_10 = pop1900_10
replace citypop1900_10 = 1 if citypop1900_10 == . | citypop1900_10 == 0
gen lcitypop1900_10 = log(citypop1900_10)

** Log of mean urban population c. 1900 in HYDE **
replace urbc1900 = 1 if urbc1900 == 0
replace rurc1900 = 1 if rurc1900 == 0
gen lurbc1900 = log(urbc1900)
gen lrurc1900 = log(rurc1900)

** Centralization dummy in the Murdock atlas ** 
* An ethnic group is defined as "centralized" if it has 2, 3 or 4 jurisdictional levels
* See N. Gennaioli, I. Rainer. The Modern Impact of Precolonial Centralization in Africa. Published 1 November 2005. Indigenous Nations & Peoples Law eJournal.
gen centra = (polity == 2 | polity == 3 | polity == 4)

** Dummies for the year the Murdoch data is available for each cell **
gen yrstudy1900 = (year_anthro >= 1830 & year_anthro < 1900)
gen yrstudy1924 = (year_anthro >= 1830 & year_anthro < 1924)
gen yrstudy_yn = (year_anthro != 0)

** Cash crop production value **
* Export values from all crops *
gen cashc1900 = gridvalue1900
gen cashc1924 = gridvalue1924
drop gridvalue*
* We create the suitability dummies based on 7 crops * 
foreach X in cocoa coffee cotton groundnut oilpalm tea tobacco {
gen gaez_`X'_yn = (`X' > 0 & `X' != . & `X' != 0)
tab gaez_`X'_yn
}
* We know the number of times each cell is suitable. 
foreach X in cocoa coffee cotton groundnut oilpalm tea tobacco {
bysort col: egen sumgaez_`X'_yn = sum(gaez_`X'_yn)
sum sumgaez_`X'_yn, d
}
* We obtain the average production per crop. 
foreach Y in 1900 1924 {
foreach X in cocoa coffee cotton groundnut oilpalm tea tobacco {
gen cellvalue`Y'`X' = 0
replace cellvalue`Y'`X' = value`Y'`X'/sumgaez_`X'_yn if gaez_`X'_yn == 1
}
}
* We then obtain the sum of that. 
foreach Y in 1900 1924 {
egen cellvalue`Y'all = rsum(cellvalue`Y'*)
}
* We then take the logs. 
foreach Y in 1900 1924 {
replace cellvalue`Y'all = 1 if cellvalue`Y'all == 0 | cellvalue`Y'all == .
}
foreach Y in 1900 1924 {
gen lcellvalue`Y'all = log(cellvalue`Y'all)
}
* We also create if the cell is suitable for any crop. 
egen gaez_any_yn = rmax(gaez_*_yn)
save grid_africa_final, replace
count

*** Data set that we use for the panel below ***

use grid_africa_final, clear
keep gridcell cocoa coffee cotton groundnut oilpalm tea tobacco
sort gridcell
save temp_crops, replace

*** Information on early mission and colonizer ***

* We created a data set with information on the colonizer and the first year there was a mission and a modern mission, i.e. when missionary efforts really took off. 
* Various sources have been used for each country (Felix Meier zu Selhausen can provide details on the sources used). 
import excel "coloniser_early_mission_cleaned.xlsx", sheet("Dataset") firstrow clear
sort country
save coloniser_early_mission_cleaned, replace

* We add to the main data set.
use grid_africa_final, clear
sort country
merge country using coloniser_early_mission_cleaned
tab _m
drop _m
save grid_africa_final, replace

*** Information on the year the mission was opened in Beach 1900 ***

* Beach data with foundyr = the year of foundations
use newbeachdata, clear
sort gridcell
save newbeachdata, replace

* We add to the main data set. 
use grid_africa_final, clear
desc dist2slaveport_10km
sort gridcell
merge gridcell using newbeachdata
tab _m
drop if _m == 2
drop _m
* We create dummies if the mission had been opened by these years below. 
replace in_BEACH = 0 if in_BEACH == .
gen in_BEACH_1850 = 0
replace in_BEACH_1850 = 1 if in_BEACH == 1 & foundyr <= 1850 & foundyr != .
gen in_BEACH_1881 = 0
replace in_BEACH_1881 = 1 if in_BEACH == 1 & foundyr <= 1881 & foundyr != .
gen in_BEACH_1900 = 0
replace in_BEACH_1900 = 1 if in_BEACH == 1 & foundyr <= 1900 & foundyr != .
replace foundyr = 0 if foundyr  == .
drop lat_BEACH long_BEACH in_CR 
label var foundyr "Earliest foundation year of a Beach mission in the cell"
label var in_BEACH_1850 "Dummy if a Beach mission was founded before 1850"
label var in_BEACH_1881 "Dummy if a Beach mission was founded before 1881"
label var in_BEACH_1900 "Dummy if a Beach mission was founded before 1900"
* We add some variables 
gen roome24_num = (catholic_nunn+protest_nunn)
replace roome24_num = 0 if roome24_num == . 
gen lroome24_num = log(roome24_num+1)
replace protestant_cr_original = 0 if protestant_cr_original == .
gen lbeach03_pro_num = log(protestant_cr_original+1)
label var roome24_num "Number of Roome missions"
label var lroome24_num "Log number of Roome missions"
ren protestant_cr_original beach03_pro_num
label var beach03_pro_num "Number of Beach missions"
label var lbeach03_pro_num "Log number of Beach missions"
gen cath_yn = (catholic_nunn >= 1 & catholic_nunn != .)
label var cath_yn "Dummy if Catholic mission"
gen prot_yn = (protest_nunn >= 1 & protest_nunn != .)
label var prot_yn "Dummy if Protestant mission"
* For the missions in Beach, we know the denomination c. 1900.
* We add to the main data set. 
sort gridcell
merge gridcell using beach_denomgh
tab _m
drop if _m == 2
drop _m
replace other = 0 if other == .
replace metho = 0 if metho == .
replace presb = 0 if presb == .
replace luthe = 0 if luthe == .
replace angli = 0 if angli == .
gen mainline = (metho == 1 | presb == 1 | luthe == 1 | angli == 1)
label var mainline "Dummy if mainline Protestant mission"
label var other  "Dummy if non-mainline Protestant mission"
label var metho  "Dummy if methodist mission"
label var presb  "Dummy if presbyterian mission"
label var luthe  "Dummy if lutheran mission"
label var angli  "Dummy if anglican mission"
* We drop some variables we don't need to make the data set smaller
drop gridcell2 print1903 luthe angli *muslimbelt* lnalt_std pop1910-pop1990 mpop1900-spop1400 mines1914* dist2rail1914 lights* land *value1961* *value2000* *ude_rain dist2beach-roome24_mis_yn_50 ldist2emst30 gaez_any_yn *value*cocoa *value*coffee *value*cotton *value*groundnut *value*oilpalm *value*tea *value*tobacco *gaez*cocoa* *gaez*coffee* *gaez*cotton* *gaez*groundnut* *gaez*oilpalm* *gaez*tea* *gaez*tobacco* gaez_any_yn weight* value*groundnuts sumGAEZ* dist2muslimcentre10 *nevbuilt ldist2proposed19162230 *mission_yr coloniser_19* dist2lake10km n_mines* mines*_20km count_hyde coffee-groundnut grid_land share_land *nltc2011 lnrurc lnurbc coastal v99 area_grid col polity monogamy dist2explorer_nunn10km rurc0 urbc0 pop2010 dist2rail2000* dist2rail1960* ldist2rail190030 ldist2rail192430 ldist2rail1900_milmin30 ldist2rail1924_milmin30 lngridvalue* earliest_mission* 
* We label the remaining variables
label var lndist2muslimcentre "Log distance to a Muslim center"
label var ldist2muslimcentre "Log distance to a Muslim center"
label var ethnicfe "Ethnic FE"
label var countryfe "Country FE"
label var districtfe "District FE"
label var lndist2slavep "Log distance to a historical slave port"
label var longitude "longitude"
label var latitude "latitude"
label var beach03_pro_yn "Dummy if Beach mission"
label var roome24_mis_yn "Dummy if Roome mission"
label var area_sqkm "Cell area (sq km)"
label var prec_mean "Mean precipitation level"
label var dist2emst30 "Distance to the EMST <= 30 km"
label var dist2proposed191622 "Distance to proposed lines in 1916/22 - not yet built in 1916/22 (km)"
label var dist2proposed19162230 "Within 30 km from proposed lines in 1916/22"
label var dist2proposed19162230_v2_00 "Placebo line 16/22 for 1900, excl. cells with rail"
label var dist2proposed19162230_v2_24 "Placebo line 16/22 for 1924, excl. cells with rail"
label var centra "Centralized pre-colonial state (Murdock atlas)"
label var lurbc1900 "Log urban population 1900"
label var lrurc1900 "Log ruralpopulation 190"
label var dist2nationalcity10 "Distance to closest national city < 10 km"
label var ldist2nationalcity "Log distance to closest national city"
label var yrstudy1900 "Dummy if year of the anthropoligical survey used by Murdock <= 1900"
label var yrstudy1924 "Dummy if year of the anthropoligical survey used by Murdock <= 1924"
label var yrstudy_yn "Dummy if know year of the anthropoligical survey used by Murdock"
label var density1800m "Population density c. 1800"
label var roome24_pro_yn "Dummy if Protestant mission in Roome"
label var ldist2coast "Log distance to the coast"
label var dist2coast10 "Distance to the coast < 10 km"
label var dist2lake10 "Distance to lake < 10 km"
label var ldist2lake "Log distance to lake"
label var dist2explorer10 "Dummy if within 10 km from explorer route"
label var ldist2explorer "Log distance to explorer route"
label var dist2slaveport10 "Dummy if within 10 from slave port"
label var citypop1900_10 "City pop. (loc. >= 10K) c. 1900"
label var lcitypop1900_10 "Log city pop. (loc. >= 10K) c. 1900"
label var dist2rail1900_milmin "Distance to military-mining line (1900)"
label var dist2rail1924_milmin "Distance to military-mining line (1924)"
label var dist2rail1900_milmin30 "Distance to military-mining line (1900) < 30 km"
label var dist2rail1924_milmin30 "Distance to military-mining line (1924) < 30 km"
label var dist2rail190030 "Distance to line (1900) < 30 km"
label var dist2rail192430 "Distance to line (1924) < 30 km"
label var dist2rail190010 "Distance to line (1900) < 10 km"
label var dist2rail192410 "Distance to line (1924) < 10 km"
label var ldist2rail1900 "Log distance to rail (1900)"
label var ldist2rail1924 "Log distance to rail (1924)"
label var cashc1900 "Cash crop value 1900"
label var cashc1924 "Cash crop value 1924"
label var cellvalue1900all "Cash crop value 1900"
label var cellvalue1924all "Cash crop value 1924"
label var lcellvalue1900all "Log cash crop value 1900"
label var lcellvalue1924all "Log cash crop value 1924"
label var ldist2slaveport "Log distance to a slave port"
label var dist2river_nav10 "Distance to a river < 10 km"
label var ldist2river_nav "Log distance to a river"
label var lndist2coast "Log distance to the coast"
order gridcell longitude latitude country countryfe region* district* districtfe ethnic* ethnicfe area_sqkm *nltc2010 alt_* temp precip prec_mean malaria tsetse soil* *river* dist2coast ldist2coast lndist2coast dist2coast10 *slaveport* dist2river_nav10km *lake* missing_murdock yrstudy* year_anthro date_col nevercolonised centra slavery dist2slaveport_10km polygamy *muslim* *explorer* ln_export_area city1400 city1800 dist2city1400 dist2city1800 density1800m *urbc1900 *rurc1900 dist2nationalcity dist2nationalcity10 ldist2nationalcity citypop1900* lcitypop1900_10 pop1900_10 pop1900 pop2000 mines* dist2rail19* ldist2rail1900 ldist2rail1924 dist2proposed* dist2rail*_military dist2rail*_mining dist2rail*_milmin* *emst* cashc* *cellvalue* *beach03_pro_* foundyr in_BEACH* *roome24* n_missionnunn protest_nunn catholic_nunn mainline presb metho other prot_yn cath_yn
save grid_africa_final, replace
use grid_africa_final, clear
desc, f

***************************************
*** PANEL FOR THE RAILROAD ANALYSIS ***
***************************************

* We construct a panel for the period 1880-1900 *

** Number of missions in each cell and year **
foreach Z of numlist 1880(5)1900 {
use "beach1903_complete.dta", clear
drop if year == .
keep if year <= `Z'
gen count = 1
collapse (sum) count, by(gridcell)
ren count nummission
gen year = `Z'
sort gridcell
save temp`Z', replace
}
* We combine all years *
use temp1880, clear
foreach Z of numlist 1880(5)1900 {
append using temp`Z'
}
sort gridcell year 
save nummission, replace

* Mainline Protestant missions *
foreach Z in 1850 1860 1870 1875 1880 1890 1900 {
use "beach1903_complete.dta", clear
gen metho = .
gen presb = .
gen other = .
gen angli = .
gen luthe = .
replace other = 1 if mission_denom == "Adventist"
replace angli = 1 if mission_denom == "Anglican"
replace other = 1 if mission_denom == "Baptist"
replace presb = 1 if mission_denom == "Free Church"
replace other = 1 if mission_denom == "Independent"
replace luthe = 1 if mission_denom == "Lutheran"
replace metho = 1 if mission_denom == "Methodist"
replace other = 1 if mission_denom == "Multidenominational"
replace presb = 1 if mission_denom == "Presbyterian"
replace other = 1 if mission_denom == "Quaker"
replace other = 1 if mission_denom == "Undenominational"
replace metho = 1 if mission_denom == "United Brethern in Christ"
replace luthe = 1 if mission_denom == "United Church"
gen mainline = (metho == 1 | presb == 1 | luthe == 1 | angli == 1)
drop if year == .
keep if year <= `Z'
gen count = 1
keep if mainline == 1
collapse (max) count, by(gridcell)
ren count mainline
gen year = `Z'
sort gridcell
save temp`Z', replace
}
use temp1850, clear
foreach Z in 1860 1870 1875 1880 1890 1900 {
append using temp`Z'
}
sort gridcell year 
save mainline, replace

* Other Protestant missions *
foreach Z in 1850 1860 1870 1875 1880 1890 1900 {
use "beach1903_complete.dta", clear
gen metho = .
gen presb = .
gen other = .
gen angli = .
gen luthe = .
replace other = 1 if mission_denom == "Adventist"
replace angli = 1 if mission_denom == "Anglican"
replace other = 1 if mission_denom == "Baptist"
replace presb = 1 if mission_denom == "Free Church"
replace other = 1 if mission_denom == "Independent"
replace luthe = 1 if mission_denom == "Lutheran"
replace metho = 1 if mission_denom == "Methodist"
replace other = 1 if mission_denom == "Multidenominational"
replace presb = 1 if mission_denom == "Presbyterian"
replace other = 1 if mission_denom == "Quaker"
replace other = 1 if mission_denom == "Undenominational"
replace metho = 1 if mission_denom == "United Brethern in Christ"
replace luthe = 1 if mission_denom == "United Church"
gen mainline = (metho == 1 | presb == 1 | luthe == 1 | angli == 1)
drop if year == .
keep if year <= `Z'
gen count = 1
keep if mainline != 1
collapse (max) count, by(gridcell)
ren count otprot
gen year = `Z'
sort gridcell
save temp`Z', replace
}
use temp1850, clear
foreach Z in 1860 1870 1875 1880 1890 1900 {
append using temp`Z'
}
sort gridcell year 
save otprot, replace

** Cells with railroads at any point before 1900 **
* The file was created for Jedwab & Moradi 2016. 
use rail_yearbuilt, clear
keep gridcell year_built
keep if year_built <= 1900
sort gridcell
save cells_rail_ssa, replace
count
* 1436

** Countries with any of these cells **
use grid_africa_final, clear
keep country gridcell longitude latitude
sort gridcell
merge gridcell using cells_rail_ssa
tab _m
drop _m
keep if year_built != .
count
ren longitude lon_rail
ren latitude lat_rail
sort country
save cells_rail_ssa, replace
keep country
bysort country: keep if _n == 1
sort country
save list_cntries_cells_rail_ssa, replace
count
* 14

** We construct distance to a railroad line in year t **
* We focus on the years 1880, 1885, 1890, 1895, 1900
foreach X of numlist 1880(5)1900 {
use cells_rail_ssa, clear
keep if year_built <= `X' 
drop year_built
save temp, replace
keep country
bysort country: keep if _n == 1
sort country
save temp_list, replace
use grid_africa_final, clear
keep country gridcell longitude latitude
sort country
merge country using temp_list
keep if _m == 3
drop _m
joinby country using temp
drop country 
geodist latitude longitude lat_rail lon_rail, gen(dist)
collapse (min) dist, by(gridcell)
ren dist dist2rail_t
gen year = `X'
sort gridcell year
save temprail`X', replace
}

** We combine them ** 
use temprail1880, clear
foreach X of numlist 1885(5)1900 {
append using temprail`X'
}
sort gridcell year
save temprail, replace
* List of cells with a mission in 1900 *
use temp1900, clear
sort gridcell
save list_cntris_rail_pre1900, replace
clear
* Years used for the panel analysis * 
use yrs_for_panel_rail, clear
sort year
save yrs_for_panel_rail, replace

* We now create the panel * 
use grid_africa_final, clear
* We keep the country, cell ID, district and ethnic group
keep country gridcell district2000 ethnic
* We create the country-district. 
gen cntrydistrict2000 = country+district2000
label var country "Country"
label var gridcell "Grid cell"
label var district2000 "District name"
label var ethnic "Ethnic group"
label var cntrydistrict2000 "Country-district"
* We only keep the countries with railroads at one point.
sort country 
merge country using list_cntries_cells_rail_ssa
tab _m
keep if _m == 3
drop _m
* We multiply by the number of years needed for the analysis
cross using yrs_for_panel_rail
tab year
* We keep 1880-1900
keep if year >= 1880
* We create the country-years. 
gen cntryyr = country+string(year)
label var cntryyr "Country-year"
* We add the railroad data
sort gridcell year
merge gridcell year using temprail, update
drop if _m == 2
tab _m
drop _m 
* We then create a dummy if within 30 km
gen rail30 = (dist2rail_t <= 30 & dist2rail_t != .)
tab year rail30, row
* We see the increase over time
label var dist2rail_t "For railroad cells, distance to rail in t (km)"
label var rail30 "Dummy if within 30 km from railroad in t"
sort gridcell year
save panel_rail_africa, replace

* We add the mission data * 
use panel_rail_africa, clear
sort gridcell
merge gridcell using newbeachdata
tab _m
drop _m
* We drop if missing information on foundation year 
drop if in_BEACH == 1 & (foundyr == . | foundyr == 0)
gen mission_yn = (foundyr <= year & foundyr != . & foundyr != 0)
sort gridcell year 
merge gridcell year using nummission
tab _m
tab year if _m == 2
drop if _m == 2
drop _m
label var nummission "Number of missions in the cell in t"
replace nummission = 0 if nummission == .
gen lnummission = log(nummission+1)
label var lnummission "Log number of missions in the cell in t"
* We add the data on the denomination.
sort gridcell year 
merge gridcell year using mainline
tab _m
tab year if _m == 2
drop if _m == 2
drop _m
sort gridcell year 
merge gridcell year using otprot
tab _m
tab year if _m == 2
drop if _m == 2
drop _m
replace mainline = 0 if mainline == .
replace otprot = 0 if otprot == .
* Finally, we create the lags and leads for the railroad variable
sort gridcell year
bysort gridcell: gen lag1rail30 = rail30[_n-1]
bysort gridcell: gen lead1rail30 = rail30[_n+1]
label var lag1rail30 "Lag of the rail variable"
label var lead1rail30 "Lea of the rail variable"
label var mission_yn "Dummy if mission in year t"
drop l*_BEACH
* We label the remaining variables
label var mainline "Dummy if mainline Protestant mission"
label var otprot "Dummy if other Protestant mission"
label var cntrydistrict2000 "Country-district (2000)"
sort gridcell year
save panel_rail_africa, replace

****************************************
*** PANEL FOR THE CASH CROP ANALYSIS ***
****************************************

* We construct a panel for the period 1850-1900 *

* We first create the panel structure of the data. 
* List of countries 
use grid_africa_final, clear
keep country
bysort country: keep if _n == 1
save list_countries_for_crops, replace
* List of years
import excel "years_for_cash_crops.xlsx", sheet("Sheet1") firstrow clear
sort year
save years_for_cash_crops, replace
* We combine
use list_countries_for_crops, clear
cross using years_for_cash_crops
drop B
drop if year == .
sort country year
save list_countries_years_for_crops, replace

* We will add on the total cash crop export value of several colonies in each year. 
* We created the excel file below which shows the colony name corresponding to each modern country.
clear
import excel "list_countries_years_for_crops2.xlsx", sheet("Sheet1") firstrow clear
ren colony colonypre1900
* We drop the ones where we don't have any information or no recorded production.
drop if noinfo_drop == 1 | noprod == 1
codebook country
* 20 countries
tab year
* 8
sort country 
save list_countries_years_for_crops2, replace 

* List of countries. 
use list_countries_years_for_crops2, clear
bysort country: keep if _n == 1
keep country
sort country
save list_countries_for_crops2, replace 

* We add the production data.
* See the text for details on the sources. 
clear 
import excel "cash_crop_values_ssa_remi.xlsx", sheet("Sheet1") firstrow clear
* These are the crops.
tab cash_crop
* We exclude rubber to focus on the other crops. 
drop if cash_crop == "Rubber"
* We change the names. 
replace cash_crop= "cocoa" if cash_crop == "Cocoa"
replace cash_crop= "coffee" if cash_crop == "Coffee"
replace cash_crop= "cotton" if cash_crop == "Cotton"
replace cash_crop= "groundnut" if cash_crop == "Groundnuts"
replace cash_crop= "palmker" if cash_crop == "Palm kernels"
replace cash_crop= "oilpalm" if cash_crop == "Palm oil"
replace cash_crop= "oilpalm" if cash_crop == "Palmoil"
replace cash_crop= "tea" if cash_crop == "Tea"
replace cash_crop= "tobacco" if cash_crop == "Tobacco"
ren export_value_BP expval 
keep colony year expval cash_crop
* We reshape the data set.
reshape wide expval, i(colony year) j(cash_crop) string
ren colony colonypre1900 
* We only use 1900.
drop if year == 1924
sort colonypre1900 year
save cash_crop_values_pre1900, replace

* We start with the main data set. 
use grid_africa_final, clear
gen cntrydistrict2000 = country+district2000
* We add the Beach mission data
sort gridcell
merge gridcell using newbeachdata
tab _m
drop if _m == 2
drop _m
* We re-add the crop suitability variables *
sort gridcell
merge gridcell using temp_crops
tab _m
drop _m
** List of countries with cash crop data **
sort country 
merge country using list_countries_for_crops2
tab _m
keep if _m == 3
drop _m
count
joinby country using list_countries_years_for_crops2
drop if year == 1924
* We add the cash crop value data. 
sort colonypre1900 year
merge colonypre1900 year using cash_crop_values_pre1900
tab _m
tab colonypre1900 if _m == 2
drop if _m == 2
drop _m
* We change the name of palm oil kernels.
gen palmker = oilpalm
* We create the suitability dummies. 
foreach X in cocoa coffee cotton groundnut oilpalm palmker tea tobacco {
gen gaez0_`X'_yn = (`X' > 0 & `X' != . & `X' != 0)
gen gaez50_`X'_yn = (`X' > 5000 & `X' != . & `X' != 0)
}
drop cocoa coffee cotton groundnut oilpalm palmker tea tobacco
* We obtain the sum of suitability 
foreach X in cocoa coffee cotton groundnut oilpalm palmker tea tobacco {
bysort colonypre1900 year: egen sumgaez0_`X'_yn = sum(gaez0_`X'_yn)
bysort colonypre1900 year: egen sumgaez50_`X'_yn = sum(gaez50_`X'_yn)
}
* Average value in each cell for each crop 
foreach X in cocoa coffee cotton groundnut oilpalm palmker tea tobacco {
gen cellvalue0`X' = 0
replace cellvalue0`X' = expval`X'/sumgaez0_`X'_yn if gaez0_`X'_yn == 1
gen cellvalue50`X' = 0
replace cellvalue50`X' = expval`X'/sumgaez50_`X'_yn if gaez50_`X'_yn == 1
}
gen cellvalue0palm = 0
replace cellvalue0palm = (cellvalue0oilpalm + cellvalue0palmker) if gaez0_oilpalm_yn == 1 | gaez0_palmker_yn == 1
gen cellvalue50palm = 0
replace cellvalue50palm = (cellvalue50oilpalm + cellvalue50palmker) if gaez50_oilpalm_yn == 1 | gaez50_palmker_yn == 1
drop gaez*_*_yn
drop expval* sumgaez*_*_yn
* We drop if there is missing information on the foundation year. 
drop if in_BEACH == 1 & (foundyr == . | foundyr == 0)
gen mission_yn = (foundyr <= year & foundyr != . & foundyr != 0)
tab year mission_yn, row
* We obtain the sum of values across cells. 
collapse (sum) cellvalue* (max) mission_yn, by(country cntrydistrict2000 ethnic gridcell year longitude latitude)
* Creating the logs
foreach Z in 0 50 {
egen cellvalue`Z'all = rsum(cellvalue`Z'*)
foreach X in cocoa coffee cotton groundnut oilpalm palmker tea tobacco palm all {
replace cellvalue`Z'`X' = 1 if cellvalue`Z'`X' == 0 | cellvalue`Z'`X' == . | cellvalue`Z'`X' <= 1
gen lcellvalue`Z'`X' = log(cellvalue`Z'`X')
}
}
* We add the lags and leads. 
sort gridcell year
foreach Z in 0 50 {
bysort gridcell: gen lead1val`Z' = lcellvalue`Z'all[_n+1]
bysort gridcell: gen lag1val`Z' = lcellvalue`Z'all[_n-1]
}
drop cellvalue*
* Number of missions + mainline and other protestants * 
sort gridcell year 
merge gridcell year using nummission
tab _m
drop if _m == 2
drop _m
replace nummission = 0 if nummission == .
gen lnummission = log(nummission+1)
foreach Z in mainline otprot {
sort gridcell year 
merge gridcell year using `Z'
tab _m
drop if _m == 2
drop _m
replace `Z' = 0 if `Z' == .
}
* We add the country-year variables
gen cntryyr = country+string(year)
* We label the variables.
label var longitude "Longitude"
label var latitude "Latitude"
label var cntrydistrict2000 "Country-district (2000)"
label var mission_yn "Dummy if mission"
drop *cocoa *tea *palmker *palm *tobacco *coffee *cotton *groundnut 
label var lcellvalue0all "Log cash crop value based on suitability > 0%"
label var lcellvalue50all "Log cash crop value based on suitability > 50%"
label var lead1val0 "Lead of lcellvalue0all"
label var lag1val0 "Lag of lcellvalue0all"
label var lead1val50 "Lead of lcellvalue50all"
label var lag1val50 "Lag of lcellvalue50all"
label var nummission "Number of missions"
label var lnummission "Log number of missions"
label var mainline "Dummy if mainline Protestant mission"
label var otprot "Dummy if other Protestant mission"
label var cntryyr "Country-year"
order gridcell year longitude latitude cntryyr cntrydistrict2000
sort gridcell year
save panel_cashc_africa, replace
